In [118]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import LabelEncoder
In [119]:
INPUT_PATH = Path("datasets/bfar.csv")
DF = pd.read_csv(INPUT_PATH)
DF.head()
Out[119]:
| RESPONSE | A1:AREA | A2:GROUP | B3:AGE | B5:SEX | B6:M-STATUS | B7:EDUCATION | B8:HH_SIZE | C1:TOT_INCOME/A | C2:INCOME/B/FISH | ... | J6:AVE_FBP-PERC | J7.1 | J7.2 | J7.3 | J7.4 | J7.5 | J7:AVE_FBP-CONT | K:COMMENTS | CD: P_SCORE | CV: PS_WT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ABULUG | 1 | 33 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 4.67 | 3.0 | 3.0 | 5.0 | 5.0 | 5.0 | 4.2 | NaN | NaN | NaN |
| 1 | 2 | ABULUG | 1 | 57 | 1 | 3 | 3 | 2 | 1 | 1 | ... | 5.00 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN | NaN |
| 2 | 3 | ABULUG | 1 | 45 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 4.83 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN | NaN |
| 3 | 4 | ABULUG | 1 | 32 | 1 | 1 | 3 | 3 | 1 | 1 | ... | 4.33 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN | NaN |
| 4 | 5 | ABULUG | 1 | 38 | 1 | 1 | 3 | 1 | 1 | 1 | ... | 5.00 | 1.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 | NaN | NaN | NaN |
5 rows × 215 columns
In [120]:
print("Data Frame Shape", DF.shape)
Data Frame Shape (1339, 215)
In [121]:
# Check if there is missing values
MISSING_DATA = DF.isnull().sum()
MISSING_DATA[MISSING_DATA > 0]
Out[121]:
Y_BOAT-RE 740 NY_W/BOAT 740 BOAT_COND 740 J1:BOAT_AGREE 740 J2:BOAT_TYPE 740 J3:BOAT_DESIGN 740 J4:BOAT_COND 740 J4: REASON-NO 740 J5.1 740 J5.2 740 J5.3 740 J5.4 740 J5.5 740 J5.6 740 J5.7 740 J6:AVE_FBP-IMPT 740 J6.1 740 J6.2 740 J6.3 740 J6.4 740 J6.5 740 J6.6 740 J6:AVE_FBP-PERC 740 J7.1 740 J7.2 740 J7.3 740 J7.4 740 J7.5 740 J7:AVE_FBP-CONT 740 K:COMMENTS 1339 CD: P_SCORE 1339 CV: PS_WT 1339 dtype: int64
In [122]:
sns.set(style="whitegrid")
# Before
ROWS_BEFORE = len(DF)
print(f"Rows before dropping duplicates: {ROWS_BEFORE}")
# After
DF = DF.drop_duplicates()
ROWS_AFTER = len(DF)
print(f"Remaining rows after dropping duplicates: {ROWS_AFTER}")
Rows before dropping duplicates: 1339 Remaining rows after dropping duplicates: 1339
In [123]:
DATA = {
'Stage': ['Before Dropping Duplicates', 'After Dropping Duplicates'],
'Row Count': [ROWS_BEFORE, ROWS_AFTER]
}
PLOT_DF = pd.DataFrame(DATA)
plt.figure(figsize=(8, 6))
sns.barplot(x='Stage', y='Row Count', hue='Stage', data=PLOT_DF, palette=['lightblue', 'lightblue'], dodge=False)
plt.title('Effect of Dropping Duplicate Rows', fontsize=14, pad=10)
plt.xlabel('Stage', fontsize=12)
plt.ylabel('Number of Rows', fontsize=12)
plt.legend([],[], frameon=False)
plt.tight_layout()
plt.show()
In [124]:
print(f"Total columns with missing values: {len(MISSING_DATA)}")
Total columns with missing values: 215
In [125]:
MISSING_PERCENT = DF.isnull().mean().sort_values(ascending=False)
MISSING_COUNT = DF.isnull().sum()[MISSING_PERCENT.index] # Get counts in same order
ax = MISSING_PERCENT[MISSING_PERCENT > 0].plot(
kind='barh',
figsize=(10, 8),
color='lightblue',
edgecolor='black',
title='Missing Values per Column (Descending)',
width=0.8
)
for index, (col, percent) in enumerate(MISSING_PERCENT[MISSING_PERCENT > 0].items()):
count = MISSING_COUNT[col]
label = f"{percent:.1%} ({count:,})"
ax.text(
percent + 0.01,
index,
label,
va='center',
fontsize=9
)
plt.xlabel('Missing Percentage', fontsize=12)
plt.ylabel('Columns', fontsize=12)
plt.xlim(0, 1.1)
plt.grid(axis='x', linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()
In [126]:
# Fix missing values
NUM_COLS = DF.select_dtypes(include='number').columns
for COL in NUM_COLS:
MEDIAN_VALUE = DF[COL].median()
DF[COL] = DF[COL].fillna(MEDIAN_VALUE)
c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\nanfunctions.py:1215: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\nanfunctions.py:1215: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\nanfunctions.py:1215: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims)
In [127]:
# Check if there is still missing values
MISSING_DATA = DF.isnull().sum()
MISSING_DATA[MISSING_DATA > 0]
Out[127]:
K:COMMENTS 1339 CD: P_SCORE 1339 CV: PS_WT 1339 dtype: int64
In [128]:
MISSING_PERCENT = DF.isnull().mean().sort_values(ascending=False)
MISSING_COUNT = DF.isnull().sum()[MISSING_PERCENT.index]
ax = MISSING_PERCENT[MISSING_PERCENT > 0].plot(
kind='barh',
figsize=(10, 8),
color='lightblue',
edgecolor='black',
title='Missing Values per Column (Descending)',
width=0.8
)
for index, (col, percent) in enumerate(MISSING_PERCENT[MISSING_PERCENT > 0].items()):
count = MISSING_COUNT[col]
label = f"{percent:.1%} ({count:,})"
ax.text(
percent + 0.01,
index,
label,
va='center',
fontsize=9
)
plt.xlabel('Missing Percentage', fontsize=12)
plt.ylabel('Columns', fontsize=12)
plt.xlim(0, 1.1)
plt.grid(axis='x', linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()
In [129]:
# Drop columns with 50% or more missing values
THRESHOLD_COL = len(DF) * 0.5
DF = DF.dropna(axis=1, thresh=THRESHOLD_COL)
In [130]:
# Check if column with 50% or more missing values already dropped
DF
Out[130]:
| RESPONSE | A1:AREA | A2:GROUP | B3:AGE | B5:SEX | B6:M-STATUS | B7:EDUCATION | B8:HH_SIZE | C1:TOT_INCOME/A | C2:INCOME/B/FISH | ... | J6.4 | J6.5 | J6.6 | J6:AVE_FBP-PERC | J7.1 | J7.2 | J7.3 | J7.4 | J7.5 | J7:AVE_FBP-CONT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ABULUG | 1 | 33 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 3.0 | 3.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1 | 2 | ABULUG | 1 | 57 | 1 | 3 | 3 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 5.00 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 2 | 3 | ABULUG | 1 | 45 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 4.0 | 5.0 | 5.0 | 4.83 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 3 | 4 | ABULUG | 1 | 32 | 1 | 1 | 3 | 3 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.33 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 4 | 5 | ABULUG | 1 | 38 | 1 | 1 | 3 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 5.00 | 1.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1334 | 1335 | SARANGANI | 0 | 52 | 1 | 2 | 2 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1335 | 1336 | SARANGANI | 0 | 63 | 1 | 2 | 2 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1336 | 1337 | SARANGANI | 0 | 55 | 1 | 2 | 2 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1337 | 1338 | SARANGANI | 0 | 43 | 1 | 2 | 2 | 1 | 2 | 2 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1338 | 1339 | SARANGANI | 0 | 42 | 1 | 2 | 3 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
1339 rows × 212 columns
In [131]:
# Convert all strings to lowercase
TEXT_COLS = DF.select_dtypes(include='object').columns
for COL in TEXT_COLS:
DF[COL] = DF[COL].astype(str).str.lower().str.strip()
In [132]:
# check if all strings converted to lowercase
DF
Out[132]:
| RESPONSE | A1:AREA | A2:GROUP | B3:AGE | B5:SEX | B6:M-STATUS | B7:EDUCATION | B8:HH_SIZE | C1:TOT_INCOME/A | C2:INCOME/B/FISH | ... | J6.4 | J6.5 | J6.6 | J6:AVE_FBP-PERC | J7.1 | J7.2 | J7.3 | J7.4 | J7.5 | J7:AVE_FBP-CONT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | abulug | 1 | 33 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 3.0 | 3.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1 | 2 | abulug | 1 | 57 | 1 | 3 | 3 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 5.00 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 2 | 3 | abulug | 1 | 45 | 1 | 2 | 3 | 2 | 1 | 1 | ... | 4.0 | 5.0 | 5.0 | 4.83 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 3 | 4 | abulug | 1 | 32 | 1 | 1 | 3 | 3 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.33 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 4 | 5 | abulug | 1 | 38 | 1 | 1 | 3 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 5.00 | 1.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1334 | 1335 | sarangani | 0 | 52 | 1 | 2 | 2 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1335 | 1336 | sarangani | 0 | 63 | 1 | 2 | 2 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1336 | 1337 | sarangani | 0 | 55 | 1 | 2 | 2 | 2 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1337 | 1338 | sarangani | 0 | 43 | 1 | 2 | 2 | 1 | 2 | 2 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
| 1338 | 1339 | sarangani | 0 | 42 | 1 | 2 | 3 | 1 | 1 | 1 | ... | 5.0 | 5.0 | 5.0 | 4.67 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.2 |
1339 rows × 212 columns
In [133]:
# DIMENSIONALITY REDUCTION
# In this process i will train 3 models
# - PCA
# - Entropy Weighted K-Means
# - Auto Encoder
# and choose the best performing model for identifyin feature weights with the metrics of
# - Silhouette Score
# - Calinski-Harabasz Index
# - Davies-Bouldin Score
In [134]:
# Model configuration
K = 3
DF_NUM = DF.select_dtypes(include='number').copy()
SCALER = MinMaxScaler()
X_SCALED = SCALER.fit_transform(DF_NUM)
In [135]:
# Create a list to hold all metrics
METRICS_SUMMARY = []
def evaluate_clustering(X, LABELS, NAME):
SILHOUETTE = silhouette_score(X, LABELS)
CALINSKI = calinski_harabasz_score(X, LABELS)
DAVIES = davies_bouldin_score(X, LABELS)
METRICS_SUMMARY.append({
'Model': NAME,
'Silhouette': SILHOUETTE,
'Calinski-Harabasz': CALINSKI,
'Davies-Bouldin': DAVIES
})
RESULTS_DF = pd.DataFrame([{
'Model': NAME,
'Silhouette': f"{SILHOUETTE:.4f}",
'Calinski-Harabasz': f"{CALINSKI:.4f}",
'Davies-Bouldin': f"{DAVIES:.4f}"
}]).set_index('Model')
print(f"\n{NAME} Clustering Evaluation:")
display(RESULTS_DF.style
.set_properties(**{'text-align': 'center'})
.format(precision=4))
In [136]:
# PCA
PCA_MODEL = PCA(n_components=2)
X_PCA = PCA_MODEL.fit_transform(X_SCALED)
KMEANS_PCA = KMeans(n_clusters=K, random_state=42, n_init=10)
LABELS_PCA = KMEANS_PCA.fit_predict(X_PCA)
evaluate_clustering(X_PCA, LABELS_PCA, "PCA")
PCA Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| PCA | 0.3980 | 1362.5074 | 0.8724 |
In [137]:
# Autoencoder
INPUT_DIM = X_SCALED.shape[1]
INPUT_LAYER = Input(shape=(INPUT_DIM,))
ENCODED = Dense(16, activation='relu')(INPUT_LAYER)
BOTTLENECK = Dense(2, activation='linear')(ENCODED)
DECODED = Dense(16, activation='relu')(BOTTLENECK)
OUTPUT_LAYER = Dense(INPUT_DIM, activation='sigmoid')(DECODED)
AUTOENCODER = Model(INPUT_LAYER, OUTPUT_LAYER)
AUTOENCODER.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
AUTOENCODER.fit(X_SCALED, X_SCALED, epochs=100, batch_size=32, verbose=0,
callbacks=[EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)])
ENCODER = Model(inputs=INPUT_LAYER, outputs=BOTTLENECK)
X_AE = ENCODER.predict(X_SCALED)
KMEANS_AE = KMeans(n_clusters=K, random_state=42, n_init=10)
LABELS_AE = KMEANS_AE.fit_predict(X_AE)
evaluate_clustering(X_AE, LABELS_AE, "Autoencoder")
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step Autoencoder Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| Autoencoder | 0.4738 | 1577.7962 | 0.8216 |
In [138]:
# Entropy Weighted K-Means
def entropy_weights(X):
X_NORM = X / (X.sum(axis=0) + 1e-9)
X_NORM = np.where(X_NORM == 0, 1e-9, X_NORM)
ENTROPY = -np.sum(X_NORM * np.log(X_NORM), axis=0) / np.log(len(X))
D = 1 - ENTROPY
WEIGHTS = D / D.sum()
return WEIGHTS
WEIGHTS = entropy_weights(X_SCALED)
X_ENTROPY = X_SCALED * WEIGHTS
KMEANS_ENTROPY = KMeans(n_clusters=K, random_state=42, n_init=10)
LABELS_ENTROPY = KMEANS_ENTROPY.fit_predict(X_ENTROPY)
evaluate_clustering(X_ENTROPY, LABELS_ENTROPY, "Entropy-Weighted")
Entropy-Weighted Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| Entropy-Weighted | 0.1686 | 86.6478 | 3.0035 |
In [139]:
METRICS_DF = pd.DataFrame(METRICS_SUMMARY)
SCORING_DF = METRICS_DF.copy()
# Invert Davies-Bouldin because lower is better
SCORING_DF['Inverse Davies-Bouldin'] = 1 / (SCORING_DF['Davies-Bouldin'] + 1e-6)
# Scale all metrics between 0 and 1
SCALER = MinMaxScaler()
SCALED_METRICS = SCALER.fit_transform(SCORING_DF[['Silhouette', 'Calinski-Harabasz', 'Inverse Davies-Bouldin']])
# Add composite score (mean of scaled metrics)
SCORING_DF['Composite Score'] = SCALED_METRICS.mean(axis=1)
# Find the best model
BEST_MODEL = SCORING_DF.loc[SCORING_DF['Composite Score'].idxmax()]
# Show metrics summary
print("\nOverall Clustering Metrics Summary:")
display(METRICS_DF)
Overall Clustering Metrics Summary:
| Model | Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|---|
| 0 | PCA | 0.398025 | 1362.507416 | 0.872423 |
| 1 | Autoencoder | 0.473812 | 1577.796218 | 0.821649 |
| 2 | Entropy-Weighted | 0.168575 | 86.647821 | 3.003505 |
In [140]:
print("\nBest Performing Model")
display(pd.DataFrame({
'': ['Model', 'Composite Score'],
'Best Performance': [BEST_MODEL['Model'], f"{BEST_MODEL['Composite Score']:.4f}"]
}).style.hide(axis='index'))
Best Performing Model
| Best Performance | |
|---|---|
| Model | Autoencoder |
| Composite Score | 1.0000 |
In [141]:
# Plot scattered distribution in each model to check if the best performing model is reasonable
plt.figure(figsize=(15, 4))
plt.subplot(1, 3, 1)
sns.scatterplot(x=X_PCA[:, 0], y=X_PCA[:, 1], hue=LABELS_PCA, palette="tab10")
plt.title("PCA + KMeans")
plt.subplot(1, 3, 2)
sns.scatterplot(x=X_AE[:, 0], y=X_AE[:, 1], hue=LABELS_AE, palette="tab10")
plt.title("Autoencoder + KMeans")
plt.subplot(1, 3, 3)
sns.scatterplot(x=X_ENTROPY[:, 0], y=X_ENTROPY[:, 1], hue=LABELS_ENTROPY, palette="tab10")
plt.title("Entropy Weighted + KMeans")
plt.tight_layout()
plt.show()
In [142]:
print("\nExtracting Feature Weights from Best Model...\n")
if BEST_MODEL['Model'] == "PCA":
PCA_WEIGHTS = np.abs(PCA_MODEL.components_[:2]).mean(axis=0)
FEATURE_WEIGHTS = pd.Series(PCA_WEIGHTS, index=DF_NUM.columns).sort_values(ascending=False)
elif BEST_MODEL['Model'] == "Autoencoder":
ENCODER_WEIGHTS = np.abs(AUTOENCODER.layers[1].get_weights()[0])
SUMMED_WEIGHTS = ENCODER_WEIGHTS.sum(axis=1)
FEATURE_WEIGHTS = pd.Series(SUMMED_WEIGHTS, index=DF_NUM.columns).sort_values(ascending=False)
elif BEST_MODEL['Model'] == "Entropy-Weighted":
FEATURE_WEIGHTS = pd.Series(WEIGHTS, index=DF_NUM.columns).sort_values(ascending=False)
Extracting Feature Weights from Best Model...
In [143]:
# Show top features
print("📌 Top Contributing Features:")
display(FEATURE_WEIGHTS.head(10))
📌 Top Contributing Features:
D2/A:Ind_APP 11.972435 D1/A:Ind_VEH 11.090771 D1/B:Ind_VEH 10.557349 D2.7-A_QTY 8.336466 D2/B:Ind_APP 8.041806 D2.7:A_STOVE 8.025345 H:Ind_ASWS 7.619016 D3.4-A_QTY 7.268851 D3.4:A_OTHERS 7.225698 D3.1:B_CP 7.176279 dtype: float32
In [144]:
plt.figure(figsize=(10, 5))
FEATURE_WEIGHTS.head(10).plot(kind='barh')
plt.gca().invert_yaxis()
plt.title(f"Top 10 Feature Weights - {BEST_MODEL['Model']}")
plt.xlabel("Weight")
plt.tight_layout()
plt.show()
In [145]:
print("All Feature Weights:")
display(FEATURE_WEIGHTS.to_frame(name="Weight").style
.background_gradient(cmap='Blues', subset=['Weight'])
.format({'Weight': '{:.4f}'})
.set_properties(**{'text-align': 'center'})
.set_caption(f"Feature Importance ({BEST_MODEL['Model']})"))
All Feature Weights:
| Weight | |
|---|---|
| D2/A:Ind_APP | 11.9724 |
| D1/A:Ind_VEH | 11.0908 |
| D1/B:Ind_VEH | 10.5573 |
| D2.7-A_QTY | 8.3365 |
| D2/B:Ind_APP | 8.0418 |
| D2.7:A_STOVE | 8.0253 |
| H:Ind_ASWS | 7.6190 |
| D3.4-A_QTY | 7.2689 |
| D3.4:A_OTHERS | 7.2257 |
| D3.1:B_CP | 7.1763 |
| D2.9-A_QTY | 7.0621 |
| D2.10:A_OTHERS | 7.0049 |
| D2.9:A_FURNITURE | 6.8864 |
| D2.3-B_QTY | 6.8597 |
| C1:TOT_INCOME/A | 6.6398 |
| C2:INCOME/B/FISH | 6.4967 |
| J4: REASON-NO | 6.4826 |
| D3.1-B_QTY | 6.3860 |
| J4:BOAT_COND | 6.3384 |
| D2.10-A_QTY | 6.2314 |
| C4:INCOME/B/ALT | 6.2273 |
| D2.3:B_WASH-M | 6.1217 |
| D2.3:A_WASH-M | 5.8258 |
| D3.3-A_QTY | 5.7777 |
| D2.6:A_FRIDGE | 5.7259 |
| D3/B:YC_GAD | 5.6129 |
| D1.7:A_OTHERS | 5.6088 |
| NY_W/BOAT | 5.5187 |
| G2:B_GSIS | 5.5068 |
| D2.1-B_QTY | 5.4941 |
| D1.3:A_TRICYCLE | 5.4154 |
| D2.6-A_QTY | 5.4007 |
| D1.3-A_QTY | 5.3903 |
| D3/B:AC_GAD | 5.3257 |
| A2:GROUP | 5.3242 |
| D2.3-A_QTY | 5.3200 |
| D1.4:A_CAR | 5.3064 |
| D3.3:A_COMPUTER | 5.2791 |
| G2:A_GSIS | 5.1508 |
| D1.3-B_QTY | 5.1173 |
| D1.5:B_JEEP | 5.0887 |
| D3.2:B_LANDLINE | 5.0857 |
| J2:BOAT_TYPE | 5.0698 |
| D1.4-A_QTY | 5.0177 |
| D2.1:B_TV | 4.9475 |
| D3.2:A_LANDLINE | 4.9201 |
| D1.7-A_QTY | 4.8969 |
| D2.7:B_STOVE | 4.8598 |
| G4:B_PN-IN | 4.8166 |
| H2:RET_P | 4.8120 |
| D2.7-B_QTY | 4.7763 |
| E5:B_NET-SUBS | 4.7589 |
| G4:A_PN-IN | 4.7491 |
| D1.6-B_QTY | 4.7351 |
| D3/A:IndGAD | 4.7251 |
| D1.2-A_QTY | 4.6605 |
| D1.2:A_MOTORC | 4.6564 |
| D1.3:B_TRICYCLE | 4.6467 |
| D1.5-B_QTY | 4.6424 |
| I4:TFA | 4.5250 |
| D1.5:A_JEEP | 4.4779 |
| H3:SPES | 4.4556 |
| G5:B_LIFE-IN | 4.4346 |
| H7:AS_P | 4.3253 |
| D1.6:B_TRUCK | 4.2873 |
| H5:TBE | 4.2441 |
| D3.2-A_QTY | 4.2151 |
| D1.5-A_QTY | 4.1950 |
| H6:F_PC | 4.1888 |
| D2.4-A_QTY | 4.1476 |
| D2.10-B_QTY | 4.1426 |
| D2.2-A_QTY | 4.1202 |
| I6.2:FT | 4.1182 |
| J3:BOAT_DESIGN | 4.0760 |
| G6:A_HEALTH-IN | 4.0727 |
| D3.1:A_CP | 4.0673 |
| H8:E/CW_P | 4.0638 |
| D2.10:B_OTHERS | 4.0612 |
| I5:TFV | 4.0178 |
| B5:SEX | 4.0003 |
| I8.4:FISH_COMP | 3.9715 |
| I8.3:BOAT_P | 3.9552 |
| D3.3-B_QTY | 3.9343 |
| D2.9:B_FURNITURE | 3.9335 |
| D1.4-B_QTY | 3.9324 |
| D2/A:YC_APP | 3.8778 |
| D1.6:A_TRUCK | 3.8764 |
| D2.4:A_AC | 3.8565 |
| D3.1-A_QTY | 3.8130 |
| D3/B:IndGAD | 3.8128 |
| D1.1-A_QTY | 3.8087 |
| E4:A_COOK-FUEL | 3.7855 |
| D1/A:AC_VEH | 3.7781 |
| I7.2:LIFE_B | 3.7562 |
| B8:HH_SIZE | 3.7556 |
| D1.2-B_QTY | 3.7203 |
| D1.1:A_BIKE | 3.7079 |
| I2:A/C_M | 3.6716 |
| D2.9-B_QTY | 3.6578 |
| D1.6-A_QTY | 3.6078 |
| D3/A:AC_GAD | 3.5855 |
| D2.5-B_QTY | 3.5096 |
| G5:A_LIFE-IN | 3.5005 |
| D3.3:B_COMPUTER | 3.4910 |
| J7.2 | 3.4822 |
| D1/B:AC_VEH | 3.4589 |
| G6:B_HEALTH-IN | 3.4456 |
| D2.6-B_QTY | 3.4389 |
| E5:A_NET-SUBS | 3.4294 |
| D2.4-B_QTY | 3.4258 |
| J5.3 | 3.3975 |
| D3.4:B_OTHERS | 3.3949 |
| D2.8:A_E-HEATER | 3.3770 |
| D2.1-A_QTY | 3.3633 |
| H4:AL_P | 3.3624 |
| C5:TOT_INCOME/B | 3.3542 |
| D1.2:B_MOTORC | 3.3385 |
| D3.2-B_QTY | 3.3171 |
| I8.5:OTHERS | 3.2762 |
| D2.2-B_QTY | 3.2625 |
| BOAT_COND | 3.2572 |
| H1:4Ps | 3.2554 |
| D2/B:YC_APP | 3.2539 |
| B7:EDUCATION | 3.2443 |
| D2.4:B_AC | 3.2435 |
| D3/A:YC_GAD | 3.2380 |
| D1.7:B_OTHERS | 3.2241 |
| J7.4 | 3.1988 |
| I3:NOP_H | 3.1923 |
| D2.8:B_E-HEATER | 3.1893 |
| RESPONSE | 3.1747 |
| D1.4:B_CAR | 3.1730 |
| D1.7-B_QTY | 3.1718 |
| F1:A_HOUSE-OWN | 3.1361 |
| Y_BOAT-RE | 3.1098 |
| I6.5:OTHERS | 3.1031 |
| D2.1:A_TV | 3.0895 |
| D3.4-B_QTY | 3.0713 |
| E4:B_COOK-FUEL | 3.0459 |
| J7.1 | 3.0371 |
| D2.6:B_FRIDGE | 3.0246 |
| D2.8-A_QTY | 3.0146 |
| G3:A_PhilHealth | 3.0024 |
| G/A:Ind_INSU | 2.9828 |
| J5.5 | 2.9494 |
| J7.5 | 2.9400 |
| D2.8-B_QTY | 2.9323 |
| F3:A_HOUSE-BUILT | 2.8753 |
| D2.2:B_DVD | 2.8692 |
| D1/A:YC_VEH | 2.8685 |
| J6.6 | 2.8683 |
| F3:B_HOUSE-BUILT | 2.8509 |
| D2.2:A_DVD | 2.8442 |
| F2:A_HOUSE-ACQ | 2.8280 |
| F2:B_HOUSE-ACQ | 2.8086 |
| J5.6 | 2.8036 |
| D/B:AVE_Ind_PA | 2.7986 |
| J6.2 | 2.7774 |
| J5.2 | 2.7759 |
| D2/B:AC_APP | 2.7751 |
| J7:AVE_FBP-CONT | 2.7746 |
| I7.5:OTHERS | 2.7525 |
| I7.1:LIFE_J | 2.7430 |
| D1/B:YC_VEH | 2.7145 |
| F4:B_OTHER-RP | 2.6988 |
| D2/A:AC_APP | 2.6470 |
| D/A:AVE_Ind_PA | 2.6096 |
| E3:A_POWER-SUP | 2.5863 |
| J6.1 | 2.5729 |
| D2.5:B_E-FAN | 2.5700 |
| J5.4 | 2.5405 |
| J7.3 | 2.5182 |
| J5.7 | 2.4582 |
| G1:B_SSS | 2.4189 |
| I8.1:BAD_W | 2.3854 |
| F1:B_HOUSE-OWN | 2.3597 |
| J6:AVE_FBP-IMPT | 2.3348 |
| I8.2:FISH_R | 2.3142 |
| I1:FD_Y | 2.2985 |
| I7.4:CP | 2.2456 |
| J6.4 | 2.2419 |
| J6:AVE_FBP-PERC | 2.2241 |
| F4:A_OTHER-RP | 2.2120 |
| I7.3:F_LIGHT | 2.1926 |
| B3:AGE | 2.1627 |
| D2.5:A_E-FAN | 2.1615 |
| A: SES_INDEX | 2.1443 |
| J6.3 | 2.1325 |
| I6.3:PPN | 2.1288 |
| J1:BOAT_AGREE | 2.1277 |
| G1:A_SSS | 2.1230 |
| G/B:Ind_INSU | 2.1203 |
| E1:B_DRINK-H2O | 2.1044 |
| I6.4:H&L | 2.0855 |
| E3:B_POWER-SUP | 2.0800 |
| J6.5 | 2.0462 |
| D2.5-A_QTY | 2.0303 |
| F/A:Ind_REALP | 2.0281 |
| J5.1 | 1.9694 |
| I6.1:GN | 1.9538 |
| B: SES_INDEX | 1.9419 |
| G3:B_PhilHealth | 1.9105 |
| E1:A_DRINK-H2O | 1.8638 |
| E2:B_DOMESTIC-H2O | 1.8595 |
| F/B:Ind_REALP | 1.8511 |
| D1.1:B_BIKE | 1.8131 |
| E/B:Ind_LIFECON | 1.7700 |
| E/A:Ind_LIFECON | 1.7546 |
| E2:A_DOMESTIC-H2O | 1.5413 |
| D1.1-B_QTY | 1.4177 |
| B6:M-STATUS | 1.2784 |
In [146]:
# ENSEMBLE LEARNING FOR CLUSTERING
# In this process i will train 3 models
# - Entropy Weighted K-Means
# - K-Prototypes
# - Hierarchical Clustering
# and choose the best performing model for identifyin feature weights with the metrics of
# - Silhouette Score
# - Calinski-Harabasz Index
# - Davies-Bouldin Score
In [147]:
# Model configuration
K = 3
DF_CLEANED = DF.copy()
In [148]:
# Separate numeric and categorical
DF_NUM = DF_CLEANED.select_dtypes(include='number')
DF_CAT = DF_CLEANED.select_dtypes(include='object')
In [149]:
# Normalize numeric data
SCALER = MinMaxScaler()
X_NUM_SCALED = SCALER.fit_transform(DF_NUM)
In [150]:
# Label encode categorical data
LABEL_ENCODERS = {}
DF_CAT_ENCODED = DF_CAT.copy()
for COL in DF_CAT.columns:
LE = LabelEncoder()
DF_CAT_ENCODED[COL] = LE.fit_transform(DF_CAT[COL])
LABEL_ENCODERS[COL] = LE
X_CAT_ENCODED = DF_CAT_ENCODED.values
In [151]:
# Combine numeric and categorical
X_MIXED = np.concatenate([X_NUM_SCALED, X_CAT_ENCODED], axis=1)
In [152]:
# Created a list that holds all the metrics
METRICS_SUMMARY_2 = []
def evaluate_model(X, LABELS, NAME):
SILHOUETTE = silhouette_score(X, LABELS)
CALINSKI = calinski_harabasz_score(X, LABELS)
DAVIES = davies_bouldin_score(X, LABELS)
METRICS_SUMMARY_2.append({
'Model': NAME,
'Silhouette': SILHOUETTE,
'Calinski-Harabasz': CALINSKI,
'Davies-Bouldin': DAVIES
})
RESULTS_DF = pd.DataFrame([{
'Model': NAME,
'Silhouette': f"{SILHOUETTE:.4f}",
'Calinski-Harabasz': f"{CALINSKI:.4f}",
'Davies-Bouldin': f"{DAVIES:.4f}"
}]).set_index('Model')
print(f"\n{NAME} Clustering Evaluation:")
display(RESULTS_DF.style
.set_properties(**{'text-align': 'center'})
.format(precision=4))
In [153]:
# Entropy Weigthed K-Means
def entropy_weights(X):
X_NORM = X / (X.sum(axis=0) + 1e-9)
X_NORM = np.where(X_NORM == 0, 1e-9, X_NORM)
ENTROPY = -np.sum(X_NORM * np.log(X_NORM), axis=0) / np.log(len(X))
D = 1 - ENTROPY
WEIGHTS = D / D.sum()
return WEIGHTS
WEIGHTS = entropy_weights(X_NUM_SCALED)
X_WEIGHTED = X_NUM_SCALED * WEIGHTS
KMEANS_WEIGHTED = KMeans(n_clusters=K, random_state=42, n_init=10)
LABELS_WKMEANS = KMEANS_WEIGHTED.fit_predict(X_WEIGHTED)
evaluate_model(X_WEIGHTED, LABELS_WKMEANS, "Weighted KMeans")
Weighted KMeans Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| Weighted KMeans | 0.1686 | 86.6478 | 3.0035 |
In [154]:
# K-Prototypes
KPROTO = KPrototypes(n_clusters=K, init='Cao', n_init=5, verbose=0)
LABELS_KPROTO = KPROTO.fit_predict(X_MIXED, categorical=list(range(X_NUM_SCALED.shape[1], X_MIXED.shape[1])))
evaluate_model(X_MIXED, LABELS_KPROTO, "K-Prototypes")
K-Prototypes Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| K-Prototypes | 0.0127 | 36.5386 | 4.7558 |
In [155]:
# Hierarchical Clustering
HIERARCHICAL = AgglomerativeClustering(n_clusters=K, linkage='ward')
LABELS_HIER = HIERARCHICAL.fit_predict(X_NUM_SCALED)
evaluate_model(X_NUM_SCALED, LABELS_HIER, "Hierarchical Clustering")
Hierarchical Clustering Clustering Evaluation:
| Silhouette | Calinski-Harabasz | Davies-Bouldin | |
|---|---|---|---|
| Model | |||
| Hierarchical Clustering | 0.0782 | 69.2062 | 3.0031 |
In [156]:
METRICS_DF2 = pd.DataFrame(METRICS_SUMMARY_2)
SCORING_DF2 = METRICS_DF2.copy()
SCORING_DF2['Inverse Davies-Bouldin'] = 1 / (SCORING_DF2['Davies-Bouldin'] + 1e-6)
SCALER = MinMaxScaler()
SCALED = SCALER.fit_transform(SCORING_DF2[['Silhouette', 'Calinski-Harabasz', 'Inverse Davies-Bouldin']])
SCORING_DF2['Composite Score'] = SCALED.mean(axis=1)
In [157]:
BEST_MODEL2 = SCORING_DF2.loc[SCORING_DF2['Composite Score'].idxmax()]
print("\nBest Performing Clustering Model:")
display(pd.DataFrame({
'': ['Model', 'Composite Score'],
'Best Performance': [BEST_MODEL2['Model'], f"{BEST_MODEL2['Composite Score']:.4f}"]
}).style.hide(axis='index'))
Best Performing Clustering Model:
| Best Performance | |
|---|---|
| Model | Weighted KMeans |
| Composite Score | 0.9999 |
In [158]:
FINAL_LABELS = None
if BEST_MODEL2['Model'] == "Weighted KMeans":
FINAL_MODEL = KMeans(n_clusters=K, random_state=42, n_init=10)
FINAL_LABELS = FINAL_MODEL.fit_predict(X_WEIGHTED)
elif BEST_MODEL2['Model'] == "K-Prototypes":
FINAL_MODEL = KPrototypes(n_clusters=K, init='Cao', n_init=5, verbose=0)
FINAL_LABELS = FINAL_MODEL.fit_predict(X_MIXED, categorical=list(range(X_NUM_SCALED.shape[1], X_MIXED.shape[1])))
elif BEST_MODEL2['Model'] == "Hierarchical Clustering":
FINAL_MODEL = AgglomerativeClustering(n_clusters=K, linkage='ward')
FINAL_LABELS = FINAL_MODEL.fit_predict(X_NUM_SCALED)
DF['final_cluster'] = FINAL_LABELS
C:\Users\User\AppData\Local\Temp\ipykernel_9152\1974701490.py:15: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` DF['final_cluster'] = FINAL_LABELS
In [159]:
if BEST_MODEL2['Model'] in ["Weighted KMeans", "K-Prototypes", "Hierarchical Clustering"]:
from sklearn.decomposition import PCA
PCA_VIS = PCA(n_components=2)
X_VIS = PCA_VIS.fit_transform(X_WEIGHTED if BEST_MODEL2['Model'] == "Weighted KMeans" else X_NUM_SCALED)
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_VIS[:, 0], y=X_VIS[:, 1], hue=FINAL_LABELS, palette="Set2", s=60)
plt.title(f"Clusters by {BEST_MODEL2['Model']}")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
In [160]:
print("\nPer Feature Cluster Breakdown:\n")
for FEATURE in DF.columns:
if FEATURE == 'final_cluster':
continue
print(f"--- Feature: {FEATURE} ---")
if DF[FEATURE].dtype in ['int64', 'float64']:
FEATURE_SUMMARY = DF.groupby('final_cluster')[FEATURE].agg(['mean', 'std', 'min', 'max', 'count'])
else:
FEATURE_SUMMARY = DF.groupby('final_cluster')[FEATURE].value_counts(normalize=True).unstack(fill_value=0)
print(FEATURE_SUMMARY)
print("\n")
Per Feature Cluster Breakdown:
--- Feature: RESPONSE ---
mean std min max count
final_cluster
0 610.244648 348.353694 28 1319 327
1 436.244898 445.278964 1 1269 49
2 702.184839 389.759888 3 1339 963
--- Feature: A1:AREA ---
A1:AREA abulug alubijid aparri bolinao bugasong buguey \
final_cluster
0 0.045872 0.018349 0.055046 0.045872 0.070336 0.076453
1 0.387755 0.000000 0.040816 0.061224 0.061224 0.000000
2 0.029076 0.033229 0.056075 0.019730 0.069574 0.057113
A1:AREA gitagum gumaca hamtic itogon ... manito \
final_cluster ...
0 0.027523 0.024465 0.061162 0.061162 ... 0.003058
1 0.000000 0.020408 0.000000 0.020408 ... 0.000000
2 0.019730 0.020768 0.067497 0.024922 ... 0.028037
A1:AREA morong mulanay pandan san jose san juan sarangani \
final_cluster
0 0.070336 0.021407 0.058104 0.030581 0.036697 0.003058
1 0.061224 0.000000 0.040816 0.061224 0.020408 0.000000
2 0.021807 0.045691 0.061267 0.065421 0.017653 0.053998
A1:AREA sta. ana talisay ternate
final_cluster
0 0.058104 0.009174 0.036697
1 0.061224 0.000000 0.040816
2 0.040498 0.079958 0.031153
[3 rows x 23 columns]
--- Feature: A2:GROUP ---
mean std min max count
final_cluster
0 0.403670 0.491385 0 1 327
1 0.551020 0.502545 0 1 49
2 0.456906 0.498398 0 1 963
--- Feature: B3:AGE ---
mean std min max count
final_cluster
0 48.691131 12.557329 19 79 327
1 49.693878 14.695810 20 75 49
2 48.350987 12.710407 18 87 963
--- Feature: B5:SEX ---
mean std min max count
final_cluster
0 1.149847 0.357468 1 2 327
1 1.183673 0.391230 1 2 49
2 1.088266 0.283829 1 2 963
--- Feature: B6:M-STATUS ---
mean std min max count
final_cluster
0 1.987768 0.631366 1 4 327
1 2.000000 0.677003 1 4 49
2 2.066459 0.647839 1 4 963
--- Feature: B7:EDUCATION ---
mean std min max count
final_cluster
0 2.926606 0.872621 1 5 327
1 2.877551 0.753676 2 5 49
2 2.600208 0.699777 1 5 963
--- Feature: B8:HH_SIZE ---
mean std min max count
final_cluster
0 1.966361 0.744365 1 4 327
1 1.897959 0.822722 1 4 49
2 1.936656 0.731768 1 4 963
--- Feature: C1:TOT_INCOME/A ---
mean std min max count
final_cluster
0 1.247706 0.652827 1 6 327
1 1.367347 0.950743 1 5 49
2 1.130841 0.471114 1 6 963
--- Feature: C2:INCOME/B/FISH ---
mean std min max count
final_cluster
0 1.266055 0.601259 1 6 327
1 1.448980 0.980247 1 6 49
2 1.226376 0.557178 1 6 963
--- Feature: C4:INCOME/B/ALT ---
mean std min max count
final_cluster
0 1.269113 0.607514 1 6 327
1 1.469388 0.892143 1 5 49
2 1.125649 0.448834 1 5 963
--- Feature: C5:TOT_INCOME/B ---
mean std min max count
final_cluster
0 2.510703 0.853920 2 6 327
1 2.857143 1.224745 2 6 49
2 2.345794 0.728004 2 6 963
--- Feature: D1.1:A_BIKE ---
mean std min max count
final_cluster
0 0.281346 0.450345 0 1 327
1 0.224490 0.421570 0 1 49
2 0.183801 0.387523 0 1 963
--- Feature: D1.1-A_QTY ---
mean std min max count
final_cluster
0 0.281346 0.450345 0 1 327
1 0.224490 0.421570 0 1 49
2 0.184839 0.391036 0 2 963
--- Feature: D1.2:A_MOTORC ---
mean std min max count
final_cluster
0 0.504587 0.500745 0 1 327
1 0.367347 0.487078 0 1 49
2 0.236760 0.425315 0 1 963
--- Feature: D1.2-A_QTY ---
mean std min max count
final_cluster
0 0.504587 0.500745 0 1 327
1 0.367347 0.487078 0 1 49
2 0.236760 0.425315 0 1 963
--- Feature: D1.3:A_TRICYCLE ---
mean std min max count
final_cluster
0 0.198777 0.399691 0 1 327
1 0.142857 0.353553 0 1 49
2 0.026999 0.162165 0 1 963
--- Feature: D1.3-A_QTY ---
mean std min max count
final_cluster
0 0.198777 0.399691 0 1 327
1 0.142857 0.353553 0 1 49
2 0.026999 0.162165 0 1 963
--- Feature: D1.4:A_CAR ---
mean std min max count
final_cluster
0 0.024465 0.154724 0 1 327
1 0.102041 0.305839 0 1 49
2 0.000000 0.000000 0 0 963
--- Feature: D1.4-A_QTY ---
mean std min max count
final_cluster
0 0.024465 0.154724 0 1 327
1 0.102041 0.305839 0 1 49
2 0.000000 0.000000 0 0 963
--- Feature: D1.5:A_JEEP ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.061224 0.242226 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D1.5-A_QTY ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.081633 0.276642 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D1.6:A_TRUCK ---
mean std min max count
final_cluster
0 0.006116 0.078086 0 1 327
1 0.081633 0.276642 0 1 49
2 0.000000 0.000000 0 0 963
--- Feature: D1.6-A_QTY ---
mean std min max count
final_cluster
0 0.006116 0.078086 0 1 327
1 0.102041 0.305839 0 1 49
2 0.000000 0.000000 0 0 963
--- Feature: D1.7:A_OTHERS ---
mean std min max count
final_cluster
0 0.009174 0.095488 0 1 327
1 0.224490 0.421570 0 1 49
2 0.013499 0.115460 0 1 963
--- Feature: D1.7-A_QTY ---
mean std min max count
final_cluster
0 0.009174 0.095488 0 1 327
1 0.224490 0.421570 0 1 49
2 0.012461 0.110989 0 1 963
--- Feature: D1/A:YC_VEH ---
mean std min max count
final_cluster
0 14.823853 12.902455 0.0 85.7 327
1 17.212245 23.146397 0.0 100.0 49
2 6.563448 8.905913 0.0 42.9 963
--- Feature: D1/A:AC_VEH ---
mean std min max count
final_cluster
0 0.147584 0.129986 0.0 0.86 327
1 0.177551 0.241648 0.0 1.00 49
2 0.064901 0.088608 0.0 0.43 963
--- Feature: D1/A:Ind_VEH ---
mean std min max count
final_cluster
0 3.849847 6.614723 0.0 73.47 327
1 8.454286 20.569155 0.0 100.00 49
2 1.215971 2.185411 0.0 18.37 963
--- Feature: D2.1:A_TV ---
mean std min max count
final_cluster
0 0.782875 0.412921 0 1 327
1 0.551020 0.502545 0 1 49
2 0.512980 0.500091 0 1 963
--- Feature: D2.1-A_QTY ---
mean std min max count
final_cluster
0 0.785933 0.410802 0 1 327
1 0.571429 0.500000 0 1 49
2 0.517134 0.502041 0 2 963
--- Feature: D2.2:A_DVD ---
mean std min max count
final_cluster
0 0.195719 0.397361 0 1 327
1 0.571429 0.500000 0 1 49
2 0.083074 0.276137 0 1 963
--- Feature: D2.2-A_QTY ---
mean std min max count
final_cluster
0 0.195719 0.397361 0 1 327
1 0.571429 0.500000 0 1 49
2 0.084112 0.281419 0 2 963
--- Feature: D2.3:A_WASH-M ---
mean std min max count
final_cluster
0 0.703364 0.457475 0 1 327
1 0.489796 0.505076 0 1 49
2 0.145379 0.352666 0 1 963
--- Feature: D2.3-A_QTY ---
mean std min max count
final_cluster
0 0.703364 0.457475 0 1 327
1 0.489796 0.505076 0 1 49
2 0.146417 0.356635 0 2 963
--- Feature: D2.4:A_AC ---
mean std min max count
final_cluster
0 0.061162 0.239995 0 1 327
1 0.714286 0.456435 0 1 49
2 0.015576 0.123894 0 1 963
--- Feature: D2.4-A_QTY ---
mean std min max count
final_cluster
0 0.073394 0.261183 0 1 327
1 0.714286 0.456435 0 1 49
2 0.019730 0.139143 0 1 963
--- Feature: D2.5:A_E-FAN ---
mean std min max count
final_cluster
0 0.892966 0.309630 0 1 327
1 0.571429 0.500000 0 1 49
2 0.650052 0.477201 0 1 963
--- Feature: D2.5-A_QTY ---
mean std min max count
final_cluster
0 0.899083 0.321374 0 2 327
1 0.571429 0.500000 0 1 49
2 0.653167 0.482713 0 2 963
--- Feature: D2.6:A_FRIDGE ---
mean std min max count
final_cluster
0 0.660550 0.474248 0 1 327
1 0.571429 0.500000 0 1 49
2 0.071651 0.258043 0 1 963
--- Feature: D2.6-A_QTY ---
mean std min max count
final_cluster
0 0.660550 0.474248 0 1 327
1 0.571429 0.500000 0 1 49
2 0.071651 0.258043 0 1 963
--- Feature: D2.7:A_STOVE ---
mean std min max count
final_cluster
0 0.642202 0.480087 0 1 327
1 0.551020 0.502545 0 1 49
2 0.106957 0.309220 0 1 963
--- Feature: D2.7-A_QTY ---
mean std min max count
final_cluster
0 0.642202 0.480087 0 1 327
1 0.551020 0.502545 0 1 49
2 0.106957 0.309220 0 1 963
--- Feature: D2.8:A_E-HEATER ---
mean std min max count
final_cluster
0 0.195719 0.397361 0 1 327
1 0.469388 0.504234 0 1 49
2 0.021807 0.146128 0 1 963
--- Feature: D2.8-A_QTY ---
mean std min max count
final_cluster
0 0.214067 0.410802 0 1 327
1 0.469388 0.504234 0 1 49
2 0.021807 0.146128 0 1 963
--- Feature: D2.9:A_FURNITURE ---
mean std min max count
final_cluster
0 0.379205 0.485933 0 1 327
1 0.346939 0.480929 0 1 49
2 0.028037 0.165165 0 1 963
--- Feature: D2.9-A_QTY ---
mean std min max count
final_cluster
0 0.379205 0.485933 0 1 327
1 0.346939 0.480929 0 1 49
2 0.028037 0.165165 0 1 963
--- Feature: D2.10:A_OTHERS ---
mean std min max count
final_cluster
0 0.003058 0.055300 0 1 327
1 0.591837 0.496587 0 1 49
2 0.008307 0.090813 0 1 963
--- Feature: D2.10-A_QTY ---
mean std min max count
final_cluster
0 0.024465 0.154724 0 1 327
1 0.591837 0.496587 0 1 49
2 0.006231 0.078728 0 1 963
--- Feature: D2/A:YC_APP ---
mean std min max count
final_cluster
0 45.168196 15.952936 0.0 80.0 327
1 54.285714 28.795254 0.0 100.0 49
2 16.448598 11.465133 0.0 50.0 963
--- Feature: D2/A:AC_APP ---
mean std min max count
final_cluster
0 0.457798 0.170326 0.0 1.0 327
1 0.544898 0.290159 0.0 1.0 49
2 0.165628 0.117370 0.0 0.8 963
--- Feature: D2/A:Ind_APP ---
mean std min max count
final_cluster
0 23.321101 14.688513 0.0 70.0 327
1 37.755102 31.963215 0.0 100.0 49
2 4.053998 4.647132 0.0 32.0 963
--- Feature: D3.1:A_CP ---
mean std min max count
final_cluster
0 0.856269 0.351354 0 1 327
1 0.489796 0.505076 0 1 49
2 0.636552 0.481242 0 1 963
--- Feature: D3.1-A_QTY ---
mean std min max count
final_cluster
0 0.856269 0.351354 0 1 327
1 0.489796 0.505076 0 1 49
2 0.636552 0.481242 0 1 963
--- Feature: D3.2:A_LANDLINE ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.122449 0.331201 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D3.2-A_QTY ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.142857 0.408248 0 2 49
2 0.001038 0.032225 0 1 963
--- Feature: D3.3:A_COMPUTER ---
mean std min max count
final_cluster
0 0.070336 0.256105 0 1 327
1 0.244898 0.434483 0 1 49
2 0.006231 0.078728 0 1 963
--- Feature: D3.3-A_QTY ---
mean std min max count
final_cluster
0 0.067278 0.250887 0 1 327
1 0.265306 0.490557 0 2 49
2 0.006231 0.078728 0 1 963
--- Feature: D3.4:A_OTHERS ---
mean std min max count
final_cluster
0 0.015291 0.122894 0 1 327
1 0.489796 0.505076 0 1 49
2 0.007269 0.084992 0 1 963
--- Feature: D3.4-A_QTY ---
mean std min max count
final_cluster
0 0.018349 0.134414 0 1 327
1 0.510204 0.544765 0 2 49
2 0.007269 0.084992 0 1 963
--- Feature: D3/A:YC_GAD ---
mean std min max count
final_cluster
0 23.853211 11.936046 0.0 75.0 327
1 33.673469 24.233575 0.0 100.0 49
2 16.277259 12.402592 0.0 75.0 963
--- Feature: D3/A:AC_GAD ---
mean std min max count
final_cluster
0 0.238532 0.120956 0.0 0.75 327
1 0.352041 0.301551 0.0 1.75 49
2 0.162773 0.124026 0.0 0.75 963
--- Feature: D3/A:IndGAD ---
mean std min max count
final_cluster
0 7.110092 6.501896 0.0 56.25 327
1 18.622449 30.605058 0.0 175.00 49
2 4.186137 3.807333 0.0 56.25 963
--- Feature: D/A:AVE_Ind_PA ---
mean std min max count
final_cluster
0 27.946177 9.241908 0.0 55.5 327
1 35.061224 18.025073 6.7 69.3 49
2 13.024922 7.309862 0.0 44.5 963
--- Feature: E1:A_DRINK-H2O ---
mean std min max count
final_cluster
0 2.883792 1.090371 1 4 327
1 2.693878 1.261680 1 4 49
2 2.679128 1.065354 1 4 963
--- Feature: E2:A_DOMESTIC-H2O ---
mean std min max count
final_cluster
0 2.617737 1.302684 1 4 327
1 2.306122 1.261680 1 4 49
2 2.586708 1.255967 1 4 963
--- Feature: E3:A_POWER-SUP ---
mean std min max count
final_cluster
0 2.633028 0.717792 0 3 327
1 2.693878 0.741734 0 3 49
2 2.329180 1.006553 0 3 963
--- Feature: E4:A_COOK-FUEL ---
mean std min max count
final_cluster
0 2.740061 0.898030 1 4 327
1 2.836735 0.943110 2 4 49
2 2.191070 0.687612 1 4 963
--- Feature: E5:A_NET-SUBS ---
mean std min max count
final_cluster
0 0.140673 0.348217 0 1 327
1 0.346939 0.480929 0 1 49
2 0.043614 0.204340 0 1 963
--- Feature: E/A:Ind_LIFECON ---
mean std min max count
final_cluster
0 61.621101 13.801021 33.3 100.0 327
1 64.079592 17.416898 38.3 100.0 49
2 54.565836 13.123626 15.0 100.0 963
--- Feature: F1:A_HOUSE-OWN ---
mean std min max count
final_cluster
0 2.688073 0.683243 1 3 327
1 2.693878 0.683255 1 3 49
2 2.425753 0.832950 1 3 963
--- Feature: F2:A_HOUSE-ACQ ---
mean std min max count
final_cluster
0 2.665138 1.229223 0.0 3.5 327
1 2.775510 1.270913 0.0 3.5 49
2 2.233645 1.418958 0.0 4.0 963
--- Feature: F3:A_HOUSE-BUILT ---
mean std min max count
final_cluster
0 2.321101 0.771987 1.0 4.0 327
1 2.500000 0.866025 1.0 4.0 49
2 1.841121 0.903730 1.0 4.0 963
--- Feature: F4:A_OTHER-RP ---
mean std min max count
final_cluster
0 2.605505 1.317194 0.0 4.0 327
1 2.775510 1.380901 0.0 4.0 49
2 2.287643 1.452943 0.0 4.0 963
--- Feature: F/A:Ind_REALP ---
mean std min max count
final_cluster
0 72.232416 19.785450 14.6 100.0 327
1 75.253061 20.713050 20.8 100.0 49
2 61.974247 21.213947 14.6 100.0 963
--- Feature: G1:A_SSS ---
mean std min max count
final_cluster
0 0.339450 0.474248 0 1 327
1 0.244898 0.434483 0 1 49
2 0.168224 0.374260 0 1 963
--- Feature: G2:A_GSIS ---
mean std min max count
final_cluster
0 0.027523 0.163852 0 1 327
1 0.326531 0.473804 0 1 49
2 0.018692 0.135504 0 1 963
--- Feature: G3:A_PhilHealth ---
mean std min max count
final_cluster
0 0.519878 0.500370 0 1 327
1 0.387755 0.492287 0 1 49
2 0.473520 0.499558 0 1 963
--- Feature: G4:A_PN-IN ---
mean std min max count
final_cluster
0 0.067278 0.250887 0 1 327
1 0.102041 0.305839 0 1 49
2 0.051921 0.221983 0 1 963
--- Feature: G5:A_LIFE-IN ---
mean std min max count
final_cluster
0 0.076453 0.266128 0 1 327
1 0.102041 0.305839 0 1 49
2 0.036345 0.187244 0 1 963
--- Feature: G6:A_HEALTH-IN ---
mean std min max count
final_cluster
0 0.082569 0.275651 0 1 327
1 0.102041 0.305839 0 1 49
2 0.023884 0.152766 0 1 963
--- Feature: G/A:Ind_INSU ---
mean std min max count
final_cluster
0 22.262997 22.231726 0.0 120.0 327
1 25.306122 27.010958 0.0 120.0 49
2 15.171340 16.896614 0.0 100.0 963
--- Feature: A: SES_INDEX ---
mean std min max count
final_cluster
0 46.197554 9.209208 0.0 73.8 327
1 50.406122 13.180453 23.1 76.8 49
2 34.109761 11.368133 0.0 61.6 963
--- Feature: D1.1:B_BIKE ---
mean std min max count
final_cluster
0 0.256881 0.437583 0 1 327
1 0.408163 0.496587 0 1 49
2 0.222222 0.415956 0 1 963
--- Feature: D1.1-B_QTY ---
mean std min max count
final_cluster
0 0.256881 0.437583 0 1 327
1 0.408163 0.496587 0 1 49
2 0.222222 0.415956 0 1 963
--- Feature: D1.2:B_MOTORC ---
mean std min max count
final_cluster
0 0.608563 0.488820 0 1 327
1 0.367347 0.487078 0 1 49
2 0.340602 0.474158 0 1 963
--- Feature: D1.2-B_QTY ---
mean std min max count
final_cluster
0 0.608563 0.488820 0 1 327
1 0.367347 0.487078 0 1 49
2 0.340602 0.474158 0 1 963
--- Feature: D1.3:B_TRICYCLE ---
mean std min max count
final_cluster
0 0.211009 0.408650 0 1 327
1 0.102041 0.305839 0 1 49
2 0.044652 0.206646 0 1 963
--- Feature: D1.3-B_QTY ---
mean std min max count
final_cluster
0 0.211009 0.408650 0 1 327
1 0.102041 0.305839 0 1 49
2 0.043614 0.204340 0 1 963
--- Feature: D1.4:B_CAR ---
mean std min max count
final_cluster
0 0.021407 0.144958 0 1 327
1 0.122449 0.331201 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D1.4-B_QTY ---
mean std min max count
final_cluster
0 0.024465 0.154724 0 1 327
1 0.122449 0.331201 0 1 49
2 0.002077 0.045549 0 1 963
--- Feature: D1.5:B_JEEP ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.061224 0.242226 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D1.5-B_QTY ---
mean std min max count
final_cluster
0 0.015291 0.122894 0 1 327
1 0.061224 0.242226 0 1 49
2 0.002077 0.045549 0 1 963
--- Feature: D1.6:B_TRUCK ---
mean std min max count
final_cluster
0 0.000000 0.000000 0 0 327
1 0.081633 0.276642 0 1 49
2 0.000000 0.000000 0 0 963
--- Feature: D1.6-B_QTY ---
mean std min max count
final_cluster
0 0.006116 0.078086 0 1 327
1 0.081633 0.276642 0 1 49
2 0.001038 0.032225 0 1 963
--- Feature: D1.7:B_OTHERS ---
mean std min max count
final_cluster
0 0.006116 0.078086 0 1 327
1 0.448980 0.502545 0 1 49
2 0.021807 0.146128 0 1 963
--- Feature: D1.7-B_QTY ---
mean std min max count
final_cluster
0 0.009174 0.095488 0 1 327
1 0.448980 0.502545 0 1 49
2 0.021807 0.146128 0 1 963
--- Feature: D1/B:YC_VEH ---
mean std min max count
final_cluster
0 15.961468 11.464752 0.0 57.1 327
1 22.755102 21.212163 0.0 100.0 49
2 8.983904 9.898226 0.0 42.9 963
--- Feature: D1/B:AC_VEH ---
mean std min max count
final_cluster
0 0.161162 0.121116 0.0 0.71 327
1 0.227347 0.212953 0.0 1.00 49
2 0.089346 0.101022 0.0 0.71 963
--- Feature: D1/B:Ind_VEH ---
mean std min max count
final_cluster
0 3.899633 4.753102 0.0 32.65 327
1 9.578163 20.208343 0.0 100.00 49
2 1.792222 2.800811 0.0 20.41 963
--- Feature: D2.1:B_TV ---
mean std min max count
final_cluster
0 0.828746 0.377308 0 1 327
1 0.591837 0.496587 0 1 49
2 0.611630 0.487633 0 1 963
--- Feature: D2.1-B_QTY ---
mean std min max count
final_cluster
0 0.828746 0.377308 0 1 327
1 0.591837 0.496587 0 1 49
2 0.611630 0.487633 0 1 963
--- Feature: D2.2:B_DVD ---
mean std min max count
final_cluster
0 0.192661 0.394993 0 1 327
1 0.489796 0.505076 0 1 49
2 0.082035 0.274561 0 1 963
--- Feature: D2.2-B_QTY ---
mean std min max count
final_cluster
0 0.192661 0.394993 0 1 327
1 0.510204 0.505076 0 1 49
2 0.083074 0.276137 0 1 963
--- Feature: D2.3:B_WASH-M ---
mean std min max count
final_cluster
0 0.767584 0.423020 0 1 327
1 0.387755 0.492287 0 1 49
2 0.241952 0.428488 0 1 963
--- Feature: D2.3-B_QTY ---
mean std min max count
final_cluster
0 0.767584 0.423020 0 1 327
1 0.387755 0.492287 0 1 49
2 0.241952 0.428488 0 1 963
--- Feature: D2.4:B_AC ---
mean std min max count
final_cluster
0 0.085627 0.280241 0 1 327
1 0.775510 0.421570 0 1 49
2 0.024922 0.155969 0 1 963
--- Feature: D2.4-B_QTY ---
mean std min max count
final_cluster
0 0.085627 0.280241 0 1 327
1 0.775510 0.421570 0 1 49
2 0.024922 0.155969 0 1 963
--- Feature: D2.5:B_E-FAN ---
mean std min max count
final_cluster
0 0.920489 0.270949 0 1 327
1 0.673469 0.473804 0 1 49
2 0.792316 0.405860 0 1 963
--- Feature: D2.5-B_QTY ---
mean std min max count
final_cluster
0 0.935780 0.311593 0 3 327
1 0.693878 0.508432 0 2 49
2 0.802700 0.421009 0 2 963
--- Feature: D2.6:B_FRIDGE ---
mean std min max count
final_cluster
0 0.758410 0.428703 0 1 327
1 0.591837 0.496587 0 1 49
2 0.128764 0.335113 0 1 963
--- Feature: D2.6-B_QTY ---
mean std min max count
final_cluster
0 0.758410 0.428703 0 1 327
1 0.591837 0.496587 0 1 49
2 0.128764 0.335113 0 1 963
--- Feature: D2.7:B_STOVE ---
mean std min max count
final_cluster
0 0.740061 0.439273 0 1 327
1 0.571429 0.500000 0 1 49
2 0.149533 0.356798 0 1 963
--- Feature: D2.7-B_QTY ---
mean std min max count
final_cluster
0 0.740061 0.439273 0 1 327
1 0.571429 0.500000 0 1 49
2 0.149533 0.356798 0 1 963
--- Feature: D2.8:B_E-HEATER ---
mean std min max count
final_cluster
0 0.232416 0.423020 0 1 327
1 0.469388 0.504234 0 1 49
2 0.035306 0.184649 0 1 963
--- Feature: D2.8-B_QTY ---
mean std min max count
final_cluster
0 0.232416 0.423020 0 1 327
1 0.469388 0.504234 0 1 49
2 0.035306 0.184649 0 1 963
--- Feature: D2.9:B_FURNITURE ---
mean std min max count
final_cluster
0 0.452599 0.498511 0 1 327
1 0.346939 0.480929 0 1 49
2 0.039460 0.194788 0 1 963
--- Feature: D2.9-B_QTY ---
mean std min max count
final_cluster
0 0.452599 0.498511 0 1 327
1 0.346939 0.480929 0 1 49
2 0.038422 0.192312 0 1 963
--- Feature: D2.10:B_OTHERS ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.428571 0.500000 0 1 49
2 0.006231 0.078728 0 1 963
--- Feature: D2.10-B_QTY ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.428571 0.500000 0 1 49
2 0.006231 0.078728 0 1 963
--- Feature: D2/B:YC_APP ---
mean std min max count
final_cluster
0 49.908257 13.532198 10.0 90.0 327
1 53.265306 25.689472 10.0 100.0 49
2 21.121495 11.854488 0.0 60.0 963
--- Feature: D2/B:AC_APP ---
mean std min max count
final_cluster
0 0.500612 0.136340 0.1 1.0 327
1 0.536735 0.259562 0.1 1.0 49
2 0.212253 0.119229 0.0 0.6 963
--- Feature: D2/B:Ind_APP ---
mean std min max count
final_cluster
0 26.813456 14.382550 1.0 90.0 327
1 35.102041 28.426253 1.0 100.0 49
2 5.888889 5.791519 0.0 36.0 963
--- Feature: D3.1:B_CP ---
mean std min max count
final_cluster
0 0.935780 0.245520 0 1 327
1 0.530612 0.504234 0 1 49
2 0.778816 0.415260 0 1 963
--- Feature: D3.1-B_QTY ---
mean std min max count
final_cluster
0 0.935780 0.245520 0 1 327
1 0.530612 0.504234 0 1 49
2 0.778816 0.415260 0 1 963
--- Feature: D3.2:B_LANDLINE ---
mean std min max count
final_cluster
0 0.018349 0.134414 0 1 327
1 0.224490 0.421570 0 1 49
2 0.002077 0.045549 0 1 963
--- Feature: D3.2-B_QTY ---
mean std min max count
final_cluster
0 0.018349 0.134414 0 1 327
1 0.224490 0.468388 0 2 49
2 0.002077 0.045549 0 1 963
--- Feature: D3.3:B_COMPUTER ---
mean std min max count
final_cluster
0 0.116208 0.320965 0 1 327
1 0.224490 0.421570 0 1 49
2 0.013499 0.115460 0 1 963
--- Feature: D3.3-B_QTY ---
mean std min max count
final_cluster
0 0.113150 0.317261 0 1 327
1 0.244898 0.480044 0 2 49
2 0.013499 0.115460 0 1 963
--- Feature: D3.4:B_OTHERS ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.285714 0.456435 0 1 49
2 0.005192 0.071906 0 1 963
--- Feature: D3.4-B_QTY ---
mean std min max count
final_cluster
0 0.012232 0.110090 0 1 327
1 0.306122 0.508432 0 2 49
2 0.005192 0.071906 0 1 963
--- Feature: D3/B:YC_GAD ---
mean std min max count
final_cluster
0 27.064220 11.482402 0.0 75.0 327
1 31.632653 24.882804 0.0 100.0 49
2 19.989616 11.119737 0.0 75.0 963
--- Feature: D3/B:AC_GAD ---
mean std min max count
final_cluster
0 0.269878 0.114122 0.0 0.75 327
1 0.326531 0.307053 0.0 1.75 49
2 0.199896 0.111197 0.0 0.75 963
--- Feature: D3/B:IndGAD ---
mean std min max count
final_cluster
0 8.600917 7.868731 0.0 56.25 327
1 17.346939 29.817842 0.0 175.00 49
2 5.231049 4.126755 0.0 56.25 963
--- Feature: D/B:AVE_Ind_PA ---
mean std min max count
final_cluster
0 30.972477 7.478005 10.0 57.86 327
1 35.880000 16.543813 8.1 75.00 49
2 16.578006 7.145593 0.0 35.95 963
--- Feature: E1:B_DRINK-H2O ---
mean std min max count
final_cluster
0 3.143731 1.065702 1 4 327
1 2.734694 1.237866 1 4 49
2 2.845275 1.083780 1 4 963
--- Feature: E2:B_DOMESTIC-H2O ---
mean std min max count
final_cluster
0 2.764526 1.297554 1 4 327
1 2.224490 1.262691 1 4 49
2 2.647975 1.279806 1 4 963
--- Feature: E3:B_POWER-SUP ---
mean std min max count
final_cluster
0 2.798165 0.642748 0 3 327
1 2.755102 0.778102 0 3 49
2 2.623053 0.828399 0 3 963
--- Feature: E4:B_COOK-FUEL ---
mean std min max count
final_cluster
0 3.113150 0.947735 1 4 327
1 2.836735 0.920755 2 4 49
2 2.273105 0.816395 1 4 963
--- Feature: E5:B_NET-SUBS ---
mean std min max count
final_cluster
0 0.201835 0.401985 0 1 327
1 0.346939 0.480929 0 1 49
2 0.062305 0.241835 0 1 963
--- Feature: E/B:Ind_LIFECON ---
mean std min max count
final_cluster
0 67.817125 15.185683 35.0 100.0 327
1 64.285714 17.816063 40.0 100.0 49
2 57.923364 13.237665 15.0 100.0 963
--- Feature: F1:B_HOUSE-OWN ---
mean std min max count
final_cluster
0 2.675841 0.690956 1 3 327
1 2.775510 0.586846 1 3 49
2 2.442368 0.825498 1 3 963
--- Feature: F2:B_HOUSE-ACQ ---
mean std min max count
final_cluster
0 2.654434 1.237144 0.0 3.5 327
1 2.795918 1.249660 0.0 3.5 49
2 2.251817 1.404139 0.0 4.0 963
--- Feature: F3:B_HOUSE-BUILT ---
mean std min max count
final_cluster
0 2.325688 0.786317 1.0 4.0 327
1 2.408163 0.833376 1.0 4.0 49
2 1.861371 0.904047 1.0 4.0 963
--- Feature: F4:B_OTHER-RP ---
mean std min max count
final_cluster
0 2.633028 1.319931 0.0 4.0 327
1 2.775510 1.380901 0.0 4.0 49
2 2.298027 1.446182 0.0 4.0 963
--- Feature: F/B:Ind_REALP ---
mean std min max count
final_cluster
0 72.253425 19.603687 14.6 100.00 327
1 75.504082 20.555869 20.8 100.00 49
2 62.363541 21.022201 14.6 105.56 963
--- Feature: G1:B_SSS ---
mean std min max count
final_cluster
0 0.333333 0.472127 0 1 327
1 0.224490 0.421570 0 1 49
2 0.145379 0.352666 0 1 963
--- Feature: G2:B_GSIS ---
mean std min max count
final_cluster
0 0.033639 0.180575 0 1 327
1 0.367347 0.487078 0 1 49
2 0.021807 0.146128 0 1 963
--- Feature: G3:B_PhilHealth ---
mean std min max count
final_cluster
0 0.547401 0.498511 0 1 327
1 0.408163 0.496587 0 1 49
2 0.495327 0.500238 0 1 963
--- Feature: G4:B_PN-IN ---
mean std min max count
final_cluster
0 0.088685 0.284724 0 1 327
1 0.142857 0.353553 0 1 49
2 0.056075 0.230186 0 1 963
--- Feature: G5:B_LIFE-IN ---
mean std min max count
final_cluster
0 0.094801 0.293389 0 1 327
1 0.102041 0.305839 0 1 49
2 0.049844 0.217736 0 1 963
--- Feature: G6:B_HEALTH-IN ---
mean std min max count
final_cluster
0 0.097859 0.297580 0 1 327
1 0.142857 0.353553 0 1 49
2 0.032191 0.176599 0 1 963
--- Feature: G/B:Ind_INSU ---
mean std min max count
final_cluster
0 23.914373 23.168468 0.0 100.0 327
1 27.755102 28.523770 0.0 120.0 49
2 15.638629 17.075174 0.0 120.0 963
--- Feature: B: SES_INDEX ---
mean std min max count
final_cluster
0 48.987156 9.521581 0.0 74.8 327
1 51.259184 12.708313 22.0 82.2 49
2 36.216511 11.715779 0.0 67.7 963
--- Feature: H1:4Ps ---
mean std min max count
final_cluster
0 0.168196 0.382714 0.0 2.0 327
1 0.091837 0.282722 0.0 1.0 49
2 0.359813 0.576817 0.0 2.0 963
--- Feature: H2:RET_P ---
mean std min max count
final_cluster
0 0.097859 0.302690 0.0 2.0 327
1 0.132653 0.335030 0.0 1.0 49
2 0.161994 0.492384 0.0 2.0 963
--- Feature: H3:SPES ---
mean std min max count
final_cluster
0 0.056575 0.239525 0.0 2.0 327
1 0.051020 0.210280 0.0 1.0 49
2 0.137591 0.474417 0.0 2.0 963
--- Feature: H4:AL_P ---
mean std min max count
final_cluster
0 0.146789 0.358739 0.0 2.0 327
1 0.132653 0.335030 0.0 1.0 49
2 0.188474 0.509313 0.0 2.0 963
--- Feature: H5:TBE ---
mean std min max count
final_cluster
0 0.120795 0.332210 0.0 2.0 327
1 0.132653 0.335030 0.0 1.0 49
2 0.167705 0.496522 0.0 2.0 963
--- Feature: H6:F_PC ---
mean std min max count
final_cluster
0 0.096330 0.299363 0.0 2.0 327
1 0.091837 0.263658 0.0 1.0 49
2 0.210280 0.523033 0.0 2.0 963
--- Feature: H7:AS_P ---
mean std min max count
final_cluster
0 0.136086 0.348943 0.0 2.0 327
1 0.275510 0.445594 0.0 1.0 49
2 0.174455 0.500199 0.0 2.0 963
--- Feature: H8:E/CW_P ---
mean std min max count
final_cluster
0 0.244648 0.434085 0.0 2.0 327
1 0.102041 0.269637 0.0 1.0 49
2 0.274143 0.550587 0.0 2.0 963
--- Feature: H:Ind_ASWS ---
mean std min max count
final_cluster
0 12.730887 16.214221 0.0 100.0 327
1 12.634694 13.598170 0.0 56.3 49
2 10.171028 13.876757 0.0 100.0 963
--- Feature: I1:FD_Y ---
mean std min max count
final_cluster
0 3.428135 0.940012 1 4 327
1 2.775510 1.104259 1 4 49
2 3.595016 0.757537 1 4 963
--- Feature: I2:A/C_M ---
mean std min max count
final_cluster
0 1.657492 0.475276 1 2 327
1 1.510204 0.505076 1 2 49
2 1.595016 0.491144 1 2 963
--- Feature: I3:NOP_H ---
mean std min max count
final_cluster
0 1.140673 1.272080 0 7 327
1 0.816327 1.093036 0 5 49
2 0.987539 1.092333 0 10 963
--- Feature: I4:TFA ---
mean std min max count
final_cluster
0 1.174312 0.432797 1 3 327
1 1.571429 0.866025 1 3 49
2 1.177570 0.442817 1 3 963
--- Feature: I5:TFV ---
mean std min max count
final_cluster
0 2.905199 0.956189 1 5 327
1 2.204082 1.098855 1 4 49
2 2.884735 0.905174 1 5 963
--- Feature: I6.1:GN ---
mean std min max count
final_cluster
0 0.577982 0.494638 0 1 327
1 0.387755 0.492287 0 1 49
2 0.504673 0.500238 0 1 963
--- Feature: I6.2:FT ---
mean std min max count
final_cluster
0 0.162080 0.369089 0 1 327
1 0.081633 0.276642 0 1 49
2 0.095535 0.294105 0 1 963
--- Feature: I6.3:PPN ---
mean std min max count
final_cluster
0 0.113150 0.317261 0 1 327
1 0.183673 0.391230 0 1 49
2 0.110073 0.313143 0 1 963
--- Feature: I6.4:H&L ---
mean std min max count
final_cluster
0 0.660550 0.474248 0 1 327
1 0.387755 0.492287 0 1 49
2 0.603323 0.489462 0 1 963
--- Feature: I6.5:OTHERS ---
mean std min max count
final_cluster
0 0.116208 0.320965 0 1 327
1 0.224490 0.421570 0 1 49
2 0.104881 0.306559 0 1 963
--- Feature: I7.1:LIFE_J ---
mean std min max count
final_cluster
0 0.357798 0.480087 0 1 327
1 0.285714 0.456435 0 1 49
2 0.283489 0.450926 0 1 963
--- Feature: I7.2:LIFE_B ---
mean std min max count
final_cluster
0 0.211009 0.408650 0 1 327
1 0.367347 0.487078 0 1 49
2 0.112150 0.315714 0 1 963
--- Feature: I7.3:F_LIGHT ---
mean std min max count
final_cluster
0 0.804281 0.397361 0 1 327
1 0.551020 0.502545 0 1 49
2 0.861890 0.345195 0 1 963
--- Feature: I7.4:CP ---
mean std min max count
final_cluster
0 0.409786 0.492548 0 1 327
1 0.306122 0.465657 0 1 49
2 0.296989 0.457169 0 1 963
--- Feature: I7.5:OTHERS ---
mean std min max count
final_cluster
0 0.070336 0.256105 0 1 327
1 0.387755 0.492287 0 1 49
2 0.049844 0.217736 0 1 963
--- Feature: I8.1:BAD_W ---
mean std min max count
final_cluster
0 0.932722 0.250887 0 1 327
1 0.571429 0.500000 0 1 49
2 0.900312 0.299740 0 1 963
--- Feature: I8.2:FISH_R ---
mean std min max count
final_cluster
0 0.226300 0.419077 0 1 327
1 0.285714 0.456435 0 1 49
2 0.211838 0.408823 0 1 963
--- Feature: I8.3:BOAT_P ---
mean std min max count
final_cluster
0 0.529052 0.499920 0 1 327
1 0.306122 0.465657 0 1 49
2 0.590862 0.491930 0 1 963
--- Feature: I8.4:FISH_COMP ---
mean std min max count
final_cluster
0 0.330275 0.471033 0 1 327
1 0.204082 0.407206 0 1 49
2 0.313603 0.464198 0 1 963
--- Feature: I8.5:OTHERS ---
mean std min max count
final_cluster
0 0.070336 0.256105 0 1 327
1 0.204082 0.407206 0 1 49
2 0.063344 0.243707 0 1 963
--- Feature: Y_BOAT-RE ---
mean std min max count
final_cluster
0 2017.269113 0.825807 2015.0 2023.0 327
1 2017.040816 0.351140 2016.0 2018.0 49
2 2017.251298 0.841483 2014.0 2023.0 963
--- Feature: NY_W/BOAT ---
mean std min max count
final_cluster
0 6.730887 0.825807 1.0 9.0 327
1 6.959184 0.351140 6.0 8.0 49
2 6.748702 0.841483 1.0 10.0 963
--- Feature: BOAT_COND ---
mean std min max count
final_cluster
0 1.880734 0.324598 1.0 2.0 327
1 1.653061 0.480929 1.0 2.0 49
2 1.865005 0.341896 1.0 2.0 963
--- Feature: J1:BOAT_AGREE ---
mean std min max count
final_cluster
0 1.877676 0.328162 1.0 2.0 327
1 1.897959 0.305839 1.0 2.0 49
2 1.881620 0.323225 1.0 2.0 963
--- Feature: J2:BOAT_TYPE ---
mean std min max count
final_cluster
0 1.018349 0.134414 1.0 2.0 327
1 1.020408 0.142857 1.0 2.0 49
2 1.034268 0.182011 1.0 2.0 963
--- Feature: J3:BOAT_DESIGN ---
mean std min max count
final_cluster
0 3.871560 0.670604 1.0 6.0 327
1 3.897959 0.510102 1.0 4.0 49
2 3.825545 0.748312 1.0 6.0 963
--- Feature: J4:BOAT_COND ---
mean std min max count
final_cluster
0 1.180428 0.385133 1.0 2.0 327
1 1.326531 0.473804 1.0 2.0 49
2 1.201454 0.401295 1.0 2.0 963
--- Feature: J4: REASON-NO ---
mean std min max count
final_cluster
0 0.412844 0.970842 0.0 4.0 327
1 0.612245 1.095756 0.0 4.0 49
2 0.440291 0.979688 0.0 4.0 963
--- Feature: J5.1 ---
mean std min max count
final_cluster
0 4.675841 0.882062 1.0 5.0 327
1 4.857143 0.408248 3.0 5.0 49
2 4.656282 0.882511 1.0 5.0 963
--- Feature: J5.2 ---
mean std min max count
final_cluster
0 4.718654 0.730821 1.0 5.0 327
1 4.714286 0.763763 1.0 5.0 49
2 4.759086 0.640059 1.0 5.0 963
--- Feature: J5.3 ---
mean std min max count
final_cluster
0 2.076453 0.823521 1.0 5.0 327
1 2.163265 1.086795 1.0 5.0 49
2 2.018692 0.860102 1.0 5.0 963
--- Feature: J5.4 ---
mean std min max count
final_cluster
0 4.785933 0.629089 1.0 5.0 327
1 4.775510 0.770965 1.0 5.0 49
2 4.818276 0.561404 1.0 5.0 963
--- Feature: J5.5 ---
mean std min max count
final_cluster
0 4.740061 0.719801 1.0 5.0 327
1 4.816327 0.666879 1.0 5.0 49
2 4.773624 0.664398 1.0 5.0 963
--- Feature: J5.6 ---
mean std min max count
final_cluster
0 4.507645 1.148126 1.0 5.0 327
1 4.408163 1.223355 1.0 5.0 49
2 4.356179 1.298020 1.0 5.0 963
--- Feature: J5.7 ---
mean std min max count
final_cluster
0 4.767584 0.669989 1.0 5.0 327
1 4.632653 0.928571 1.0 5.0 49
2 4.811007 0.607989 1.0 5.0 963
--- Feature: J6:AVE_FBP-IMPT ---
mean std min max count
final_cluster
0 3.984037 0.389823 2.00 5.0 327
1 4.081633 0.394902 2.29 5.0 49
2 4.003292 0.359941 2.00 5.0 963
--- Feature: J6.1 ---
mean std min max count
final_cluster
0 4.737003 0.690386 1.0 5.0 327
1 4.632653 0.882560 1.0 5.0 49
2 4.784008 0.564981 1.0 5.0 963
--- Feature: J6.2 ---
mean std min max count
final_cluster
0 3.993884 0.678296 1.0 5.0 327
1 3.775510 1.085229 1.0 5.0 49
2 4.121495 0.675365 1.0 5.0 963
--- Feature: J6.3 ---
mean std min max count
final_cluster
0 4.773700 0.589410 1.0 5.0 327
1 4.734694 0.531331 3.0 5.0 49
2 4.788162 0.553561 1.0 5.0 963
--- Feature: J6.4 ---
mean std min max count
final_cluster
0 4.767584 0.602491 1.0 5.0 327
1 4.714286 0.763763 2.0 5.0 49
2 4.802700 0.558913 1.0 5.0 963
--- Feature: J6.5 ---
mean std min max count
final_cluster
0 4.767584 0.597378 1.0 5.0 327
1 4.816327 0.486204 3.0 5.0 49
2 4.800623 0.531463 1.0 5.0 963
--- Feature: J6.6 ---
mean std min max count
final_cluster
0 4.721713 0.712887 1.0 5.0 327
1 4.836735 0.472005 3.0 5.0 49
2 4.777778 0.601837 1.0 5.0 963
--- Feature: J6:AVE_FBP-PERC ---
mean std min max count
final_cluster
0 4.529541 0.477845 1.00 5.0 327
1 4.511633 0.494913 2.67 5.0 49
2 4.590312 0.440021 1.00 5.0 963
--- Feature: J7.1 ---
mean std min max count
final_cluster
0 4.532110 1.115167 1.0 5.0 327
1 4.387755 1.304101 1.0 5.0 49
2 4.390447 1.265729 1.0 5.0 963
--- Feature: J7.2 ---
mean std min max count
final_cluster
0 4.648318 0.811264 1.0 5.0 327
1 4.632653 0.858630 1.0 5.0 49
2 4.715472 0.694528 1.0 5.0 963
--- Feature: J7.3 ---
mean std min max count
final_cluster
0 4.703364 0.683339 1.0 5.0 327
1 4.653061 0.778648 1.0 5.0 49
2 4.718588 0.675328 1.0 5.0 963
--- Feature: J7.4 ---
mean std min max count
final_cluster
0 4.770642 0.595994 1.0 5.0 327
1 4.673469 0.774267 1.0 5.0 49
2 4.767394 0.601364 1.0 5.0 963
--- Feature: J7.5 ---
mean std min max count
final_cluster
0 4.770642 0.758991 1.0 5.0 327
1 4.673469 1.028505 1.0 5.0 49
2 4.663551 0.996350 1.0 5.0 963
--- Feature: J7:AVE_FBP-CONT ---
mean std min max count
final_cluster
0 4.207951 0.411861 2.6 5.0 327
1 4.244898 0.497100 2.4 5.0 49
2 4.216615 0.481567 1.0 5.0 963
In [161]:
sns.set(style="whitegrid")
PALETTE = sns.color_palette("husl", len(DF['final_cluster'].unique()))
for FEATURE in DF.columns:
if FEATURE == 'final_cluster':
continue
plt.figure(figsize=(8, 4))
JITTER = np.random.normal(0, 0.03, size=len(DF))
sns.scatterplot(
x=DF[FEATURE],
y=JITTER,
hue=DF['final_cluster'],
palette=PALETTE,
marker='o',
edgecolor='w',
s=70
)
plt.title(f'Cluster Distribution by Feature: {FEATURE}')
plt.xlabel(FEATURE)
plt.ylabel('Jitter (for visualization only)')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()